# Figure_A1.R

# Part of the replication archive for 
#
#   Bullock, John G., and Kelly Rader. 2021. "Response Options and the 
#   Measurement of Political Knowledge." Forthcoming in the British Journal 
#   of Political Science.

library(Bullock)  # for PDF_crop(), qw()
library(dplyr)    # for %>%, tibble()
library(Hmisc)    # for Dotplot()
library(ipumsr)   # for read_ipums_ddi(), read_ipums_micro()
library(survey)   # for survey weights
library(tidyr)    # for pivot_longer()


source(here::here("R/SSI_2017_coding.R"))
filenameStem <- here::here("float_output/Figure_A1")



# **************************************************************************
# CODE DATA FROM OUR EXPERIMENT ####
# **************************************************************************
region    <- setNames(state.region, state.name)[as.character(state)] 
SSI_demog <- data.frame(
    psid   = as.character(originalData$psid), 
    age, 
    race   = raceCoalesced, 
    educ, 
    region,
    weight = originalData$weight) %>% 
  na.omit()


age_under30 <- SSI_demog$age < 30
age_over60  <- SSI_demog$age > 60 
race_white  <- SSI_demog$race == 'White'
race_black  <- SSI_demog$race == 'African American'
race_HispanicOrLatino <- SSI_demog$race == 'Hispanic'
educ_HS_orLess <- SSI_demog$educ <= '12thDiploma'
educ_BA_orMore <- SSI_demog$educ >= "Bachelor's"
region_NorthCentral <- SSI_demog$region == 'North Central'
region_Northeast    <- SSI_demog$region == 'Northeast'
region_South        <- SSI_demog$region == 'South'
region_West         <- SSI_demog$region == 'West'



# **************************************************************************
# IMPORT CPS DATA ####
# **************************************************************************
CPS_2017_DDI <- read_ipums_ddi(here::here('data/CPS_2017/cps_00002.xml'))  
CPS_2017_original <- read_ipums_micro(CPS_2017_DDI)
CPS_2017 <- CPS_2017_original %>%
  select(CPSIDP, WTFINL, AGE, SEX, RACE, HISPAN, REGION, STATECENSUS, EDUC) %>%
  as_factor() %>%
  mutate(
    AGE = ordered(AGE),
    EDUC = ordered(EDUC),
    HISPANIC_OR_LATINO = HISPAN %in% c("Mexican", "Dominican", "Puerto Rican", "Cuban", "Central American, (excluding Salvadoran)", "Salvadoran", "South American"), 
    STATECENSUS = as.character(STATECENSUS),
    REGION4 = setNames(state.region, state.name)[STATECENSUS])
CPS_2017$REGION4[CPS_2017$STATECENSUS == "District of Columbia"] <- "South"

# Check region data
if (!sourcing()) {
  table(CPS_2017$REGION4)
  lNA(CPS_2017$REGION4)
  lNA(CPS_2017$STATECENSUS)
  CPS_2017 %>% filter(is.na(REGION4) & !is.na(STATECENSUS)) %>% { table(.$STATECENSUS) }
  CPS_2017 %>% filter(STATECENSUS == "California") %>% { table(.$REGION4) }
}



# **************************************************************************
# CALCULATE CPS PROPORTIONS ####
# **************************************************************************

# SELECT ONLY THE NEEDED COLUMNS
CPS_forProps <- within(CPS_2017,{
  agecat  <- as.character(Recode(AGE, "17:29='under 30'; 30:60='30 to 60'; 61:85='over 60'"))
  racecat <- as.character(Recode(RACE, "'White'='White'; 'Black/Negro'='African American'; else='Other'"))
  racecat[HISPAN!="Not Hispanic"&racecat!="African American"]  <- "Hispanic"
  educat  <- as.character(Recode(EDUC, "c('Some college but no degree',\"Associate's degree, occupational/vocational program\",\"Associate's degree, academic program\")='college no BA';
                c(\"Bachelor's degree\",\"Master's degree\",'Professional school degree','Doctorate degree')='BA or more';
                else='HS or less'"))
  regcat <- as.character(REGION4)
  femcat <- as.numeric(SEX)-1
  
})
CPS_forProps <- subset(
  x      = CPS_forProps, 
  select = c(agecat, racecat, educat, regcat, femcat, WTFINL))
  
  
# WEIGHT THE CPS DATA  

# create design object for CPS with CPS weights
CPS_des_wt <- svydesign(
  ids     = ~1, 
  data    = CPS_forProps, 
  weights = CPS_forProps$WTFINL)

# get weighted CPS frequencies
CPS_freq_wt <- list(
  svytable(~agecat,  design = CPS_des_wt),
  svytable(~racecat, design = CPS_des_wt),
  svytable(~educat,  design = CPS_des_wt),
  svytable(~regcat,  design = CPS_des_wt),
  svytable(~femcat,  design = CPS_des_wt))


# CREATE VECTOR OF CPS PROPORTIONS
CPS_2017_summary <- c(
  age_under_30          = prop.table(svytable(~agecat,  design = CPS_des_wt))['under 30'],
  age_over60            = prop.table(svytable(~agecat,  design = CPS_des_wt))['over 60'],
  female                = prop.table(svytable(~femcat,  design = CPS_des_wt))['1'],
  race_black            = prop.table(svytable(~racecat, design = CPS_des_wt))['African American'],
  race_HispanicOrLatino = prop.table(svytable(~racecat, design = CPS_des_wt))['Hispanic'],
  race_white            = prop.table(svytable(~racecat, design = CPS_des_wt))['White'],
  educ_HS_orLess        = prop.table(svytable(~educat,  design = CPS_des_wt))['HS or less'],
  educ_BA_orMore        = prop.table(svytable(~educat,  design = CPS_des_wt))['BA or more'],
  region_NorthCentral   = prop.table(svytable(~regcat,  design = CPS_des_wt))['North Central'],
  region_Northeast      = prop.table(svytable(~regcat,  design = CPS_des_wt))['Northeast'],
  region_South          = prop.table(svytable(~regcat,  design = CPS_des_wt))['South'],
  region_West           = prop.table(svytable(~regcat,  design = CPS_des_wt))['West'])



# **************************************************************************
# CREATE THE DATA FRAME THAT WE USE FOR PLOTTING ####
# **************************************************************************
# We plot the percentages of the SSI sample that have given characteristics.
# To do so, we use listwise deletion.  
sampleCharacteristics <- tibble(
    row = (length(CPS_2017_summary)):1,
    variable = c(
      "age: under 30 years old",
      "age: over 60 years old",
      "female",
      "black",
      "Hispanic or Latino",
      "white",
      "education: high school or less",
      "education: bachelor's degree or more",
      "region: North Central",
      "region: Northeast",
      "region: South",
      "region: West"),
    percentageSSI = c(
      meanNA(age_under30),
      meanNA(age_over60),
      meanNA(female),
      meanNA(race_black),
      meanNA(race_HispanicOrLatino),
      meanNA(race_white),
      meanNA(educ_HS_orLess),
      meanNA(educ_BA_orMore),
      meanNA(region_NorthCentral),
      meanNA(region_Northeast),
      meanNA(region_South),
      meanNA(region_West)),
    percentageCPS = CPS_2017_summary) %>%
  pivot_longer(
    cols = c(percentageSSI, percentageCPS),
    names_to = "sample",
    names_prefix = "percentage",  # e.g., change "percentageSSI" to "SSI" in the "sample" column
    values_to = "percentage")

  

# **************************************************************************
# PANEL FUNCTION ####
# **************************************************************************
sampleCharacteristics_panel <- function(...) {
    panel.dotplot(...)
    panel.axis(
      side   = "top",
      at     = xAxis$at,
      labels = FALSE,
      half   = FALSE,
      tck    = tickSize * .5)
}



# **************************************************************************
# FIGURE PARAMETERS ####
# **************************************************************************
xAxis <- list(
  labels = seq(10, 70, by = 10) %>% paste0("%"),
  at     = seq(.1, .7, by = .1),
  tck    = c(.5, 0))
yAxis <- list(
  labels = unique(sampleCharacteristics$variable),
  at     = unique(sampleCharacteristics$row),
  tck    = c(0, 0))
panelHeight <- 7  # inches
panelWidth  <- 4  # inches 
tickSize    <- 1  # cex



# **************************************************************************
# CREATE THE FIGURE ####
# **************************************************************************
sampleCharacteristics_plot <- dotplot(
  row ~ percentage, 
  groups = sample, 
  data = sampleCharacteristics,
  panel = sampleCharacteristics_panel,
  pch = qw("C S"),
  col = "black",
  cex = 1,
  xlim = c(.05, .75),
  xlab = "Sample Percentages vs. U.S. Population Percentages",
  ylab = "",
  scales = list(x = xAxis, y = yAxis))

pdf(
  file   = paste0(filenameStem, '.pdf'), 
  width  = 12, 
  height = 12, 
  paper  = "special", 
  title  = "Sample Characteristics (SSI 2017)",
  bg     = 'transparent')

  trellis.par.set(
    axis.components = list(bottom = list(pad1 = 0.75)),
    layout.heights  = list(xlab = 2))
  
  print(
    sampleCharacteristics_plot, 
    panel.width = list(panelWidth, "inches"), 
    panel.height = list(panelHeight, "inches"))

dev.off()
PDF_crop(filenameStem)



# **************************************************************************
# AUXILIARY INFORMATION ####
# **************************************************************************
# "Of the twelve categories examined in \autoref{FigSSISampleCharacteristics}, 
# there are nine for which the SSI and CPS samples are within 1.5 percentage 
# points of each other..."  
if (!sourcing()) {
sampleCharacteristics %>% 
  pivot_wider(names_from = "sample", values_from = "percentage") %>%
  mutate(absDiff = abs(SSI - CPS)) %>%
  arrange(absDiff)
}